library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally) # ggplot2-based visualization of correlations
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(factoextra) # ggplot2-based visualization of pca
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(countrycode)
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
library(cluster)
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:purrr':
##
## map
rm(list = ls())
lifeexpec = read.csv("./data/lifeexpec.csv")
obesity = read.csv("./data/obesity.csv")
tobacco = read.csv("./data/tobacco.csv")
doctor_density = read.csv("./data/doctor_density.csv")
gob_expenditure = read.csv("./data/gov_expenditure.csv")
road_death = read.csv("./data/road_death.csv")
birth_by_skilled = read.csv("./data/birth_by_skilled.csv")
maternal_death = read.csv("./data/maternal_death.csv")
lifeexpec = lifeexpec |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
lifeexpec |> distinct(DIM_TIME)
## DIM_TIME
## 1 2000
## 2 2010
## 3 2015
## 4 2019
lifeexpec = lifeexpec |> filter(DIM_TIME == "2015")
colnames(lifeexpec) = c("country", "year", "life_value")
lifeexpec <- lifeexpec |> distinct(country, .keep_all = TRUE)
obesity = obesity |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
obesity |> distinct(DIM_TIME)
## DIM_TIME
## 1 1975
## 2 1976
## 3 1977
## 4 1978
## 5 1979
## 6 1980
## 7 1981
## 8 1982
## 9 1983
## 10 1984
## 11 1985
## 12 1986
## 13 1987
## 14 1988
## 15 1989
## 16 1990
## 17 1991
## 18 1992
## 19 1993
## 20 1994
## 21 1995
## 22 1996
## 23 1997
## 24 1998
## 25 1999
## 26 2000
## 27 2001
## 28 2002
## 29 2003
## 30 2004
## 31 2005
## 32 2006
## 33 2007
## 34 2008
## 35 2009
## 36 2010
## 37 2011
## 38 2012
## 39 2013
## 40 2014
## 41 2015
## 42 2016
obesity = obesity |> filter(DIM_TIME == "2015")
obesity = obesity |> select(-DIM_TIME)
colnames(obesity) = c("country", "obesity_value")
obesity <- obesity |> distinct(country, .keep_all = TRUE)
tobacco = tobacco |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
tobacco |> distinct(DIM_TIME)
## DIM_TIME
## 1 2000
## 2 2005
## 3 2010
## 4 2015
## 5 2018
## 6 2019
## 7 2020
## 8 2025
tobacco = tobacco |> filter(DIM_TIME == "2015")
tobacco = tobacco |> select(-DIM_TIME)
colnames(tobacco) = c("country", "tobacco_value")
tobacco <- tobacco |> distinct(country, .keep_all = TRUE)
doctor_density = doctor_density |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
doctor_density |> distinct(DIM_TIME)
## DIM_TIME
## 1 1990
## 2 1991
## 3 1992
## 4 1993
## 5 1994
## 6 1995
## 7 1996
## 8 1997
## 9 1998
## 10 1999
## 11 2000
## 12 2001
## 13 2002
## 14 2003
## 15 2004
## 16 2005
## 17 2006
## 18 2007
## 19 2008
## 20 2009
## 21 2010
## 22 2011
## 23 2012
## 24 2013
## 25 2014
## 26 2015
## 27 2016
## 28 2017
## 29 2018
## 30 2019
## 31 2020
## 32 2021
doctor_density = doctor_density |> filter(DIM_TIME == "2015")
doctor_density = doctor_density |> select(-DIM_TIME)
colnames(doctor_density) = c("country", "doc_den_value")
doctor_density <- doctor_density |> distinct(country, .keep_all = TRUE)
gob_expenditure = gob_expenditure |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
gob_expenditure |> distinct(DIM_TIME)
## DIM_TIME
## 1 2000
## 2 2001
## 3 2002
## 4 2003
## 5 2004
## 6 2005
## 7 2006
## 8 2007
## 9 2008
## 10 2009
## 11 2010
## 12 2011
## 13 2012
## 14 2013
## 15 2014
## 16 2015
## 17 2016
## 18 2017
## 19 2018
## 20 2019
## 21 2020
gob_expenditure = gob_expenditure |> filter(DIM_TIME == "2015")
gob_expenditure = gob_expenditure |> select(-DIM_TIME)
colnames(gob_expenditure) = c("country", "gob_expenditure")
gob_expenditure <- gob_expenditure |> distinct(country, .keep_all = TRUE)
road_death = road_death |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
road_death |> distinct(DIM_TIME)
## DIM_TIME
## 1 2000
## 2 2001
## 3 2002
## 4 2003
## 5 2004
## 6 2005
## 7 2006
## 8 2007
## 9 2008
## 10 2009
## 11 2010
## 12 2011
## 13 2012
## 14 2013
## 15 2014
## 16 2015
## 17 2016
## 18 2017
## 19 2018
## 20 2019
road_death = road_death |> filter(DIM_TIME == "2015")
road_death = road_death |> select(-DIM_TIME)
colnames(road_death) = c("country", "road_death")
road_death <- road_death |> distinct(country, .keep_all = TRUE)
birth_by_skilled = birth_by_skilled |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
birth_by_skilled |> distinct(DIM_TIME)
## DIM_TIME
## 1 2001
## 2 2002
## 3 2003
## 4 2004
## 5 2006
## 6 2007
## 7 2008
## 8 2009
## 9 2011
## 10 2012
## 11 2013
## 12 2014
## 13 2016
## 14 2017
## 15 2018
## 16 2019
## 17 2021
## 18 2000
## 19 2005
## 20 2010
## 21 2015
## 22 2020
## 23 2022
## 24 2019-2020
## 25 2016-2017
## 26 2013-2014
## 27 2011-2012
## 28 2010-2011
## 29 2008-2009
## 30 2017-2018
## 31 2012-2013
## 32 2018-2019
## 33 2006-2007
## 34 2015-2016
## 35 1999-2000
## 36 2003-2004
## 37 2009-2010
## 38 2005-2006
## 39 2021-2022
## 40 2014-2015
## 41 2004-2005
## 42 2002-2003
## 43 2000-2001
## 44 2007-2008
## 45 2019-2021
## 46 2013-2015
## 47 2001-2003
## 48 2001-2002
## 49 2016-2018
## 50 2004-2006
## 51 2012-2014
## 52 2020-2021
## 53 2010-2013
birth_by_skilled = birth_by_skilled |> filter(DIM_TIME == "2015")
birth_by_skilled = birth_by_skilled |> select(-DIM_TIME)
colnames(birth_by_skilled) = c("country", "birth_by_skilled")
birth_by_skilled <- birth_by_skilled |> distinct(country, .keep_all = TRUE)
maternal_death = maternal_death |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
maternal_death |> distinct(DIM_TIME)
## DIM_TIME
## 1 1985
## 2 1986
## 3 1987
## 4 1988
## 5 1989
## 6 1990
## 7 1991
## 8 1992
## 9 1993
## 10 1994
## 11 1995
## 12 1996
## 13 1997
## 14 1998
## 15 1999
## 16 2000
## 17 2001
## 18 2002
## 19 2003
## 20 2004
## 21 2005
## 22 2006
## 23 2007
## 24 2008
## 25 2009
## 26 2010
## 27 2011
## 28 2012
## 29 2013
## 30 2014
## 31 2015
## 32 2016
## 33 2017
## 34 2018
## 35 2019
## 36 2020
maternal_death = maternal_death |> filter(DIM_TIME == "2015")
maternal_death = maternal_death |> select(-DIM_TIME)
colnames(maternal_death) = c("country", "maternal_death")
maternal_death <- maternal_death |> distinct(country, .keep_all = TRUE)
# input continent for grouping
country = lifeexpec |> select(country)
continent = read.csv("./data/Countries-Continents.csv")
continent$Country[continent$Country == "Korea, South"] <- "Republic of Korea"
name = left_join(country, continent, by = join_by("country" == "Country"))
name = na.omit(name)
rm(df)
## Warning in rm(df): object 'df' not found
df = left_join(lifeexpec, obesity, by = join_by(country))
df = left_join(df, tobacco, by = join_by(country))
df = left_join(df, doctor_density, by = join_by(country))
df = left_join(df, gob_expenditure, by = join_by(country))
df = left_join(df, road_death, by = join_by(country))
df = left_join(df, birth_by_skilled, by = join_by(country))
df = left_join(df, maternal_death, by = join_by(country))
df = left_join(df, name, by = join_by(country))
df = df |> relocate(Continent, .after = country)
df = df |> drop_na(Continent)
df = df |> rename("continent" = "Continent")
table(is.na(df))
##
## FALSE TRUE
## 1581 168
summary(df)
## country continent year life_value
## Length:159 Length:159 Min. :2015 Min. :47.67
## Class :character Class :character 1st Qu.:2015 1st Qu.:64.69
## Mode :character Mode :character Median :2015 Median :72.58
## Mean :2015 Mean :71.71
## 3rd Qu.:2015 3rd Qu.:77.74
## Max. :2015 Max. :84.29
##
## obesity_value tobacco_value doc_den_value gob_expenditure
## Min. : 2.10 Min. : 0.20 Min. : 0.315 Min. : 1.330
## 1st Qu.: 8.50 1st Qu.: 2.70 1st Qu.: 4.612 1st Qu.: 6.615
## Median :20.50 Median : 7.75 Median :22.679 Median : 9.340
## Mean :18.95 Mean :11.30 Mean :22.212 Mean : 9.987
## 3rd Qu.:25.40 3rd Qu.:19.68 3rd Qu.:33.852 3rd Qu.:12.890
## Max. :53.90 Max. :39.10 Max. :77.586 Max. :29.490
## NA's :2 NA's :21 NA's :65 NA's :3
## road_death birth_by_skilled maternal_death
## Min. : 0.00 Min. : 39.70 Min. : 1.314
## 1st Qu.: 6.26 1st Qu.: 96.30 1st Qu.: 10.786
## Median :14.24 Median : 99.05 Median : 52.027
## Mean :16.00 Mean : 94.31 Mean : 162.763
## 3rd Qu.:22.45 3rd Qu.: 99.90 3rd Qu.: 210.396
## Max. :63.37 Max. :100.00 Max. :1224.722
## NA's :77
dim(df)
## [1] 159 11
I obtained dataset from the World Health Organization.
sapply(df, function(x) sum(is.na(x))*100/nrow(df))
## country continent year life_value
## 0.000000 0.000000 0.000000 0.000000
## obesity_value tobacco_value doc_den_value gob_expenditure
## 1.257862 13.207547 40.880503 1.886792
## road_death birth_by_skilled maternal_death
## 0.000000 48.427673 0.000000
m = 4 # number of multiple imputations, we are going to make 4 iterations, we're going to predict missing values 4 times.
mice_mod = mice(df, m = m, method='rf') # machine learning tool, rf = random forest
##
## iter imp variable
## 1 1 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 1 2 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 1 3 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 1 4 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 2 1 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 2 2 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 2 3 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 2 4 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 3 1 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 3 2 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 3 3 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 3 4 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 4 1 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 4 2 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 4 3 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 4 4 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 5 1 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 5 2 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 5 3 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## 5 4 obesity_value tobacco_value doc_den_value gob_expenditure birth_by_skilled
## Warning: Number of logged events: 3
df_imput <- complete(mice_mod, action=m) # replace missiong value with the mice_mod
df_imput_n = df_imput[,4:ncol(df_imput)]
table(is.na(df_imput_n))
##
## FALSE
## 1272
name = df_imput$country
continent = df_imput$continent
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(df_imput_n), method = "number")
library(ggplot2)
library(tidyr)
df_long <- df_imput_n |> gather(variable, value)
# Create boxplot
ggplot(df_long, aes(x = variable, y = value)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_minimal()
# log transformation for the variable of maternal_death
df_imput$maternal_death = log(df_imput$maternal_death)
df_long <- df_imput[,4:ncol(df_imput)] |> gather(variable, value)
ggplot(df_long, aes(x = variable, y = value)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_minimal()
ggplot(df_imput, aes(x=obesity_value, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs obesity",
caption="Source: World Health Organization",
x = "obesity_value", y = "Life expectancy at birth (in years)")
ggplot(df_imput, aes(x=tobacco_value, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs tobacco usage",
caption="Source: World Health Organization",
x = "tobacco_value", y = "Life expectancy at birth (in years)")
ggplot(df_imput, aes(x=doc_den_value, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs doctor density",
caption="Source: World Health Organization",
x = "doctor density", y = "Life expectancy at birth (in years)")
## Graph 4
ggplot(df_imput, aes(x=gob_expenditure, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs goverment health expenditure ",
caption="Source: World Health Organization",
x = "goverment health expenditure", y = "Life expectancy at birth (in years)")
ggplot(df_imput, aes(x=gob_expenditure, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs goverment health expenditure ",
caption="Source: World Health Organization",
x = "goverment health expenditure", y = "Life expectancy at birth (in years)")
df_imput |>
group_by(continent) |>
do(tidy(lm(life_value ~ gob_expenditure, data = .)))|>
select(continent, term, estimate) |>
spread(term, estimate)
## # A tibble: 6 × 3
## # Groups: continent [6]
## continent `(Intercept)` gob_expenditure
## <chr> <dbl> <dbl>
## 1 Africa 60.9 0.384
## 2 Asia 66.3 0.805
## 3 Europe 70.0 0.712
## 4 North America 67.3 0.530
## 5 Oceania 58.2 1.13
## 6 South America 65.0 0.792
g_Africa = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "Africa") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.4058175, intercept = 60.74990, color = "red")
g_Asia = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "Asia") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.8045696, intercept = 66.34065, color = "red")
g_Europe = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "Europe") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.7120162, intercept = 70.04078, color = "red")
g_North_America = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "North America") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.5301466, intercept = 67.31540, color = "red")
g_Oceania = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "Oceania") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 1.1299704, intercept = 58.16559, color = "red")
g_South_America = df_imput |> select(country, continent, gob_expenditure, life_value) |>
filter(continent == "South America") |>
ggplot(aes(x=gob_expenditure, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.7915068, intercept = 64.96808, color = "red")
library(patchwork)
g_Africa + g_Asia + g_Europe + g_North_America + g_Oceania + g_South_America +
plot_annotation(
title = "Merged Plot",
tag_levels = "a",
tag_suffix = ")"
)
ggplot(df_imput, aes(x=road_death, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs road death",
caption="Source: World Health Organization",
x = "road death", y = "Life expectancy at birth (in years)")
df_imput |>
group_by(continent) |>
do(tidy(lm(life_value ~ road_death, data = .)))|>
select(continent, term, estimate) |>
spread(term, estimate)
## # A tibble: 6 × 3
## # Groups: continent [6]
## continent `(Intercept)` road_death
## <chr> <dbl> <dbl>
## 1 Africa 70.1 -0.259
## 2 Asia 74.5 -0.0731
## 3 Europe 80.7 -0.298
## 4 North America 76.2 -0.142
## 5 Oceania 73.3 -0.347
## 6 South America 88.3 -0.723
g_Africa2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "Africa") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.25900186, intercept = 70.11643, color = "red")
g_Asia2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "Asia") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.07312883, intercept = 74.50443, color = "red")
g_Europe2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "Europe") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.29809254, intercept = 80.68437, color = "red")
g_North_America2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "North America") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.14201323, intercept = 76.20349, color = "red")
g_Oceania2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "Oceania") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.34742866, intercept = 73.30715, color = "red")
g_South_America2 = df_imput |> select(country, continent, road_death, life_value) |>
filter(continent == "South America") |>
ggplot(aes(x=road_death, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = -0.72257724, intercept = 88.29099, color = "red")
library(patchwork)
g_Africa2 + g_Asia2 + g_Europe2 + g_North_America2 + g_Oceania2 + g_South_America2 +
plot_annotation(
title = "Merged Plot",
tag_levels = "a",
tag_suffix = ")"
)
ggplot(df_imput, aes(x=birth_by_skilled, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs Births attended by skilled health personnel",
caption="Source: World Health Organization",
x = "Births attended by skilled health personnel", y = "Life expectancy at birth (in years)")
df_imput |>
group_by(continent) |>
do(tidy(lm(life_value ~ birth_by_skilled, data = .)))|>
select(continent, term, estimate) |>
spread(term, estimate)
## # A tibble: 6 × 3
## # Groups: continent [6]
## continent `(Intercept)` birth_by_skilled
## <chr> <dbl> <dbl>
## 1 Africa 55.2 0.110
## 2 Asia 61.4 0.130
## 3 Europe 83.6 -0.0500
## 4 North America 66.0 0.0885
## 5 Oceania 42.8 0.304
## 6 South America 29.6 0.482
g_Asia3 = df_imput |> select(country, continent, birth_by_skilled, life_value) |>
filter(continent == "Asia") |>
ggplot(aes(x=birth_by_skilled, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.15721923, intercept = 58.71548, color = "red")
g_North_America3 = df_imput |> select(country, continent, birth_by_skilled, life_value) |>
filter(continent == "North America") |>
ggplot(aes(x=birth_by_skilled, y = life_value)) +
geom_point(alpha = 0.9) +
geom_abline(slope = 0.19524893, intercept = 56.22726, color = "red")
library(patchwork)
g_Asia3 + g_North_America3 +
plot_annotation(
title = "Merged Plot",
tag_levels = "a",
tag_suffix = ")"
)
ggplot(df_imput, aes(x=maternal_death, y=life_value, group=continent)) +
geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
facet_wrap(~ continent) +
scale_color_discrete() +
theme_minimal()+ theme(legend.position="none") +
labs(title = "World countries: life expectancy vs maternal mortality ratio",
caption="Source: World Health Organization",
x = "maternal mortality ratio", y = "Life expectancy at birth (in years)")
#install.packages("prcomp")
#library(prcomp)
table(is.na(df_imput))
##
## FALSE
## 1749
pca = prcomp(df_imput_n, scale = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0892 0.9260 0.85387 0.7823 0.68531 0.62932 0.60479
## Proportion of Variance 0.5456 0.1072 0.09114 0.0765 0.05871 0.04951 0.04572
## Cumulative Proportion 0.5456 0.6528 0.74390 0.8204 0.87910 0.92861 0.97433
## PC8
## Standard deviation 0.45317
## Proportion of Variance 0.02567
## Cumulative Proportion 1.00000
library(factoextra)
fviz_screeplot(pca, addlabels = TRUE)
barplot(pca$rotation[,1], las=2, col="darkblue")
fviz_contrib(pca, choice = "var", axes = 1)
name[order(pca$x[,1])][1:10]
## [1] "Central African Republic" "Liberia"
## [3] "Guinea-Bissau" "Somalia"
## [5] "Benin" "Lesotho"
## [7] "Mali" "Zimbabwe"
## [9] "Chad" "Guinea"
name[order(pca$x[,1], decreasing=T)][1:10]
## [1] "Cuba" "Greece" "Austria" "Germany" "Spain" "Norway"
## [7] "Ireland" "Sweden" "Australia" "Uruguay"
barplot(pca$rotation[,2], las=2, col="darkblue")
name[order(pca$x[,2])][1:5]
## [1] "Saudi Arabia" "Kuwait" "El Salvador" "Zimbabwe" "Qatar"
name[order(pca$x[,2], decreasing=T)][1:5]
## [1] "India" "Papua New Guinea" "Greece" "Afghanistan"
## [5] "Bulgaria"
fviz_contrib(pca, choice = "var", axes = 2)
## plot first two scores
data.frame(z1=pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=name,color=continent)) + geom_point(size=0) +
labs(title="First two principal components (scores)", x="PC1", y="PC2") + #guides(color=guide_legend(title="HDI"))+
theme_bw() +theme(legend.position="bottom") + geom_text(size=3, hjust=0.6, vjust=0, check_overlap = TRUE)
# The two first PCs seem independent
data.frame(z1=-pca$x[,1],Region=df_imput$country) %>% group_by(Region) %>% summarise(mean=mean(z1), n=n()) %>% arrange(desc(mean))
## # A tibble: 159 × 3
## Region mean n
## <chr> <dbl> <int>
## 1 Central African Republic 5.45 1
## 2 Liberia 4.36 1
## 3 Guinea-Bissau 4.14 1
## 4 Somalia 3.95 1
## 5 Benin 3.94 1
## 6 Lesotho 3.88 1
## 7 Mali 3.88 1
## 8 Zimbabwe 3.85 1
## 9 Chad 3.81 1
## 10 Guinea 3.79 1
## # ℹ 149 more rows
# Map our PCA index in a map:
map = data.frame(country=name, value=-pca$x[,1])
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
addLegend = TRUE, borderCol = "#C7D9FF",
catMethod = "pretty", colourPalette = "terrain",
mapTitle = c("PCA1 by Country"), lwd=1)
## You asked for 7 categories, 10 were used due to pretty() classification
## Clustering
fit_5 = kmeans(df_imput_n, centers = 5, nstart = 1000)
fit_4 = kmeans(df_imput_n, centers = 4, nstart = 1000)
fit_3 = kmeans(df_imput_n, centers = 3, nstart = 1000)
centers=fit_5$centers
barplot(centers[1,], las=2, col="darkblue")
barplot(centers[2,], las=2, col="darkblue")
barplot(centers[3,], las=2, col="darkblue")
barplot(centers[4,], las=2, col="darkblue")
barplot(centers[5,], las=2, col="darkblue")
centers=fit_4$centers
barplot(centers[1,], las=2, col="darkblue")
barplot(centers[2,], las=2, col="darkblue")
barplot(centers[3,], las=2, col="darkblue")
barplot(centers[4,], las=2, col="darkblue")
centers=fit_3$centers
barplot(centers[1,], las=2, col="darkblue")
barplot(centers[2,], las=2, col="darkblue")
barplot(centers[3,], las=2, col="darkblue")
fviz_cluster(fit_5, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")
## Too few points to calculate an ellipse
fviz_cluster(fit_4, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")
fviz_cluster(fit_3, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")
fviz_nbclust(scale(df_imput_n), kmeans, method = 'wss', k.max = 20, nstart = 1000)
# In this formula, the higher is the better
fviz_nbclust(scale(df_imput_n), kmeans, method = 'silhouette', k.max = 20, nstart = 1000)
# nboot : number of Bootsrapping
# if the line is on specific number then it means it is the maximum number of clusters
fviz_nbclust(scale(df_imput_n), kmeans, method = 'gap_stat', k.max = 10, nstart = 100, nboot = 100)
Insight
fit.km = kmeans(df_imput_n, centers = 3, nstart = 1000)
# Select here your favorite clustering tool
map = data.frame(country=name, value=fit.km$cluster)
#map = data.frame(country=names, value=fit.kmeans$cluster)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
borderCol = "#C7D9FF",
catMethod = "pretty", colourPalette = "rainbow",
mapTitle = c("Clusters"), lwd=1)
## You asked for 7 categories, 4 were used due to pretty() classification
d = dist(scale(df_imput_n), method = "euclidean")
hc = hclust(d, method = "ward.D2")
hc$labels <- name
fviz_dend(x = hc,
k=3,
palette = "jco",
rect = TRUE, rect_fill = TRUE, cex=0.5,
rect_border = "jco"
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_dend(x = hc,
k = 3,
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE)+ labs(title="Death reasons tree clustering of the world") + theme(axis.text.x=element_blank(),axis.text.y=element_blank())
groups.hc = cutree(hc, k = 8)
# Map our PCA index in a map:
map = data.frame(country=name, value=groups.hc)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
borderCol = "#C7D9FF",
catMethod = "pretty", colourPalette = "rainbow",
mapTitle = c("Clusters"), lwd=1)
heatmap(scale(df_imput_n), scale = "none", labRow = name,
distfun = function(x){dist(x, method = "euclidean")},
hclustfun = function(x){hclust(x, method = "ward.D2")},
cexRow = 0.7)